last changed: 2023-08-29


Metascape Enrichment

Data processing

At first, the correlation matrices are required and for simplicity the peak positions are changed to gene symbols.

> #H3K27ac:
> m.H3.1ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_H3.1K27ac_median.rds")
> head(m.H3.1ac)
##                         T0h      T1h      T2h      T4h
## chr1:181453-181717 3.967340 3.759846 3.836270 3.779723
## chr1:777858-779502 6.843831 6.703328 6.579330 6.871074
## chr1:827213-827667 4.894720 4.822416 5.073162 4.769361
## chr1:903806-905645 7.300799 7.406745 7.178423 7.253228
## chr1:920444-922078 5.087611 6.262964 5.091975 4.968098
## chr1:923270-926516 8.334292 8.984147 8.254235 8.021373
> m.H3.3ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_H3.3K27ac_median.rds")
> head(m.H3.3ac)
##                         T0h      T1h      T2h      T4h
## chr1:181453-181717 4.338979 4.475285 4.704645 4.461679
## chr1:777858-779502 7.191532 7.140648 6.606008 6.555047
## chr1:827213-827667 4.715587 4.437999 4.523847 4.657809
## chr1:903806-905645 7.245065 7.791045 7.397244 7.279499
## chr1:920444-922078 5.713139 6.483811 5.428020 5.093662
## chr1:923270-926516 8.631627 9.509342 8.500228 8.103128
> peaks.H3ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/H3K27ac_coding_peaks_1kb.rds") %>% 
+   as.data.frame()
> peaks.H3ac <- mutate(peaks.H3ac, peak_pos = paste0(peaks.H3ac$chr, ":", peaks.H3ac$start, "-", peaks.H3ac$end))
> peaks.H3ac <- peaks.H3ac[order(peaks.H3ac$distanceToTSS %>% as.integer() %>% abs(), decreasing = F), ]
> peaks.H3ac <- peaks.H3ac[!duplicated(peaks.H3ac$gene_symbol),]
> peaks.H3ac <- peaks.H3ac[!is.na(peaks.H3ac$gene_symbol), ]
> rownames(peaks.H3ac) <- peaks.H3ac$gene_symbol
> paged_table(peaks.H3ac)
> #RNAseq:
> m.RNAseq <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_RNAseq_median.rds")
> head(m.RNAseq)
##                       T0h       T1h       T2h       T4h
## ENSG00000187961  9.472383  9.373587  9.318045  9.252606
## ENSG00000187583  6.887230  6.861752  6.803387  6.664795
## ENSG00000272512  3.451118  3.485096  3.553123  3.609632
## ENSG00000157933 10.105359 10.144625 10.408214 10.423799
## ENSG00000158286  6.990695  6.825305  6.717835  6.642559
## ENSG00000049249  6.356913  6.404449  6.662290  6.626253

The enrichment analysis was performed with Metascape. Here, a list with all genes for each cluster is required which was uploaded and used to perform the enrichment by cluster (NAs need to be excluded).

> clus.genes <- readRDS("./../Enrichment_analysis/Data/RNAseq_genes_cluster.rds")
> lapply(clus.genes, head)
## $`1`
## [1] "ENSG00000116685" "ENSG00000186501" "ENSG00000130772" "ENSG00000162419"
## [5] "ENSG00000196182" "ENSG00000066135"
## 
## $`2`
## [1] "ENSG00000187583" "ENSG00000142798" "ENSG00000133216" "ENSG00000088280"
## [5] "ENSG00000158246" "ENSG00000142733"
## 
## $`3`
## [1] "ENSG00000187961" "ENSG00000158286" "ENSG00000162496" "ENSG00000160094"
## [5] "ENSG00000142687" "ENSG00000179862"
## 
## $`4`
## [1] "ENSG00000272512" "ENSG00000157933" "ENSG00000049249" "ENSG00000171608"
## [5] "ENSG00000197312" "ENSG00000065526"
> # make data.frame with required structure:
> l1 <- length(clus.genes[[1]])
> l2 <- length(clus.genes[[2]])
> l3 <- length(clus.genes[[3]])
> l4 <- length(clus.genes[[4]])
> 
> max.length <- max(l1, l2, l3, l4)
> 
> meta.genes <- data.frame(cluster_1 = c(clus.genes[[1]], rep(NA, max.length - l1)), 
+                          cluster_2 = c(clus.genes[[2]], rep(NA, max.length - l2)),
+                          cluster_3 = c(clus.genes[[3]], rep(NA, max.length - l3)),
+                          cluster_4 = c(clus.genes[[4]], rep(NA, max.length - l4)))
> 
> paged_table(meta.genes)

Metascape results

Here, the summarized results of the Metascape enrichment analysis are shown:

Metascape enrichment results


Here, the enrichment results are shown as a table:

> meta.data <- read_excel("./Data/metascape_result.xlsx") %>% as.data.frame()
> rownames(meta.data) <- meta.data$original_id
> 
> paged_table(meta.data)
> meta.enrich <- read_excel("./Data/metascape_result.xlsx", sheet = 2)
> 
> paged_table(meta.enrich)

Correlation Analysis

Find enriched genes in top pathways

The top hits of the enrichment were then used to get the enriched genes in the respective pathway. For this the results table of the Metascape enrichment was used.

> genes.hsv.infec <- str_split_1(meta.enrich[2, "Symbols"] %>% as.character(), ",") 
> 
> genes.cell.resp.stress <-  str_split_1(meta.enrich[15, "Symbols"] %>% as.character(), ",")
> 
> genes.TNF.sign.pw <-  str_split_1(meta.enrich[25, "Symbols"] %>% as.character(), ",")

These gene list were then used to construct a data frame which contains information about if there is a significant H3K27ac peak or not.

> RNAseq.genes <- data.frame(ensembl = meta.data$original_id, symbol = meta.data$`Gene Symbol`)
> rownames(RNAseq.genes) <- RNAseq.genes$symbol
> 
> corr.peaks <- ifelse(RNAseq.genes$symbol == peaks.H3ac[RNAseq.genes$symbol, "gene_symbol"], 
+                      peaks.H3ac[RNAseq.genes$symbol, "peak_pos"], 
+                      NA)
> 
> corr.T.F <- ifelse(is.na(corr.peaks), "No", "Yes")
> 
> RNAseq.genes <- mutate(RNAseq.genes, peaks = corr.peaks, corr_peak = corr.T.F %>% as.factor)
> 
> pw.m.RNAseq <- rbind(RNAseq.genes[genes.hsv.infec ,], 
+                     RNAseq.genes[genes.cell.resp.stress ,], 
+                     RNAseq.genes[genes.TNF.sign.pw ,])
> 
> pw <- c(rep("Herpes simplex virus 1 infection", length(genes.hsv.infec)),
+         rep("Regulation of cellular response to stress", length(genes.cell.resp.stress)),
+         rep("TNF signaling pathway", length(genes.TNF.sign.pw)))
> 
> pw.m.RNAseq <- mutate(pw.m.RNAseq, pathway = pw)
> 
> paged_table(pw.m.RNAseq)

The results were then plotted as follows:

> ggplot(pw.m.RNAseq) +
+   geom_bar(aes(y = pathway, fill = corr_peak), position = "fill", orientation = "y") +
+   labs(y = "", fill = "H3K27ac peak") +
+   theme_bw()


Calculate Correlation

These pathways were then used to calculate the correlation of the respective gene expression with the H3K27ac.

> corr.H3.1 <- corr.H3.3 <- list(HSV = NA, CRS = NA, TNF = NA)
> 
> list.pw <- list(genes.hsv.infec, genes.cell.resp.stress, genes.TNF.sign.pw)
> 
> for (pw in 1:3) {
+   corr.all.H3.1 <- corr.all.H3.3 <- c()
+   curr.genes <- list.pw[[pw]]
+   curr.ensembl <- pw.m.RNAseq[curr.genes, "ensembl"]
+   curr.peaks <- pw.m.RNAseq[curr.genes, "peaks"]
+   for (i in 1:length(list.pw[[pw]])) {
+     if (is.na(pw.m.RNAseq[curr.genes[i], "peaks"]) == T) {
+       curr.corr.H3.1 <- NA
+       curr.corr.H3.3 <- NA
+     }
+     else {
+       curr.corr.H3.1 <- cor(m.H3.1ac[curr.peaks[i], ], m.RNAseq[curr.ensembl[i], ], method = "pearson")
+       curr.corr.H3.3 <- cor(m.H3.3ac[curr.peaks[i], ], m.RNAseq[curr.ensembl[i], ], method = "pearson")
+     }
+   corr.all.H3.1 <- c(corr.all.H3.1, curr.corr.H3.1)
+   corr.all.H3.3 <- c(corr.all.H3.3, curr.corr.H3.3)
+   }
+ names(corr.all.H3.1) <- names(corr.all.H3.3) <- curr.genes
+ corr.H3.1[[pw]] <- corr.all.H3.1
+ corr.H3.3[[pw]] <- corr.all.H3.3
+ }

Overview of correlation results

Next, a data frame was constructed containing all the required information to plot correlation results. These data was again annotated to the respective construct (H3.1 or H3.3).

> # construct dataframe for each of the constructs
> pw.corr.RNAseq <- mutate(rbind(pw.m.RNAseq, pw.m.RNAseq), 
+                          R = c(corr.H3.1[["HSV"]], corr.H3.1[["CRS"]], corr.H3.1[["TNF"]], 
+                                corr.H3.3[["HSV"]], corr.H3.3[["CRS"]], corr.H3.3[["TNF"]]), 
+                          experiment = rep(c("H3.1K27ac", "H3.3K27ac"), each = nrow(pw.m.RNAseq)) %>% as.factor(), 
+                          info.corr = NA)
> 
> pw.corr.RNAseq$symbol <- pw.corr.RNAseq$symbol %>% as.factor
> pw.corr.RNAseq$pathway <- pw.corr.RNAseq$pathway %>% as.factor
> 
> # add factor for corr:
> pw.corr.RNAseq[which(pw.corr.RNAseq$corr_peak == "No"), "info.corr"] <- "no peak"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R %>% abs() <= 0.7), "info.corr"] <- "-0.7 < R < 0.7"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R <= -0.7), "info.corr"] <- "R <= -0.7"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R >= 0.7), "info.corr"] <- "R >= 0.7"
> pw.corr.RNAseq$info.corr <- pw.corr.RNAseq$info.corr %>% as.factor()
> 
> paged_table(pw.corr.RNAseq)

To plot the data as a pie chart the counts of the certain events are required, which were calculated as follows:

> pw.corr.H3.1 <- pw.corr.RNAseq[which(pw.corr.RNAseq$experiment == "H3.1K27ac"), ]
> pw.corr.H3.3 <- pw.corr.RNAseq[which(pw.corr.RNAseq$experiment == "H3.3K27ac"), ]
> 
> pie.RNAseq <- data.frame(count = c(
+   length(which(pw.corr.H3.1$info.corr == "no peak")),
+   length(which(pw.corr.H3.1$info.corr == "-0.7 < R < 0.7")),
+   length(which(pw.corr.H3.1$info.corr == "R <= -0.7")),
+   length(which(pw.corr.H3.1$info.corr == "R >= 0.7")),
+   length(which(pw.corr.H3.3$info.corr == "no peak")),
+   length(which(pw.corr.H3.3$info.corr == "-0.7 < R < 0.7")),
+   length(which(pw.corr.H3.3$info.corr == "R <= -0.7")),
+   length(which(pw.corr.H3.3$info.corr == "R >= 0.7"))),
+   group = rep(c("no peak", "-0.7 < R < 0.7", "R <= -0.7", "R >= 0.7"), 1),
+   experiment = rep(c("H3.1K27ac", "H3.3K27ac"), each = 4))
> 
> paged_table(pie.RNAseq)

The results were then plotted as a pie chart.

> ggplot(pie.RNAseq) +
+   geom_bar(aes(x = "", y = count, fill = group), 
+            stat = "identity",
+            position = "fill",
+            width = 1, 
+            color = "white") +
+   coord_polar("y", start = 0) +
+   labs(x = "", y = "", 
+        title = "Correlation between RNAseq and H3K27ac", 
+        fill = "Correlation") +
+   facet_wrap(~ experiment) +
+   theme_minimal()


Correlation by pathway

To show the correlation of each gene in the analyzed pathways, only those genes with a correlation of R >= 0.7 or <= -0.7 were used. These were selected and ordered as follows:

> # filter for R >= 0.7 / <= -0.7:
> pw.corr0.7 <- which(pw.corr.H3.1$R %>% abs() >= 0.7 | pw.corr.H3.3$R %>% abs() >= 0.7)
> 
> pw.corr0.7.H3.1 <- pw.corr.H3.1[pw.corr0.7 ,]
> pw.corr0.7.H3.3 <- pw.corr.H3.3[pw.corr0.7 ,]
> 
> pw.corr.0.7.RNAseq <- rbind(pw.corr0.7.H3.1, pw.corr0.7.H3.3)
> 
> #seperate data by pathway:
> hsv.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "Herpes simplex virus 1 infection"), ]
> rcs.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "Regulation of cellular response to stress"), ]
> tnf.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "TNF signaling pathway"), ]
> hsv.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "Herpes simplex virus 1 infection"), ]
> rcs.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "Regulation of cellular response to stress"), ]
> tnf.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "TNF signaling pathway"), ]
> 
> corr.hsv <- rbind(hsv.3.1, hsv.3.3)
> corr.rcs <- rbind(rcs.3.1, rcs.3.3)
> corr.tnf <- rbind(tnf.3.1, tnf.3.3)
> 
> # order factors levels by correlation:
> hsv.sum <- hsv.3.1$R + hsv.3.3$R
> rcs.sum <- rcs.3.1$R + rcs.3.3$R
> tnf.sum <- tnf.3.1$R + tnf.3.3$R
> 
> corr.hsv <- mutate(corr.hsv, symbol = fct_relevel(symbol, hsv.sum %>% sort %>% names))
> corr.rcs <- mutate(corr.rcs, symbol = fct_relevel(symbol, rcs.sum %>% sort %>% names))
> corr.tnf <- mutate(corr.tnf, symbol = fct_relevel(symbol, tnf.sum %>% sort %>% names))

The correlation of genes with histone acetylation were then plotted for each pathway separately.

> ggplot() +
+   geom_col(data = corr.hsv, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+   facet_wrap(~ pathway) +
+   labs(x = "gene") +
+   theme_bw() +
+   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+   scale_fill_manual(values = c("#FFCC33", "#CC3366"))

> ggplot() +
+   geom_col(data = corr.rcs, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+   facet_wrap(~ pathway) +
+   labs(x = "gene") +
+   theme_bw() +
+   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+   scale_fill_manual(values = c("#FFCC33", "#CC3366"))

> ggplot() +
+   geom_col(data = corr.tnf, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+   facet_wrap(~ pathway) +
+   labs(x = "gene") +
+   theme_bw() +
+   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+   scale_fill_manual(values = c("#FFCC33", "#CC3366"))


References - packages


Session Information

The following versions of R and R packages were used to generate the report above:

## R version 4.3.0 (2023-04-21)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_1.0.0               stringr_1.5.0              
##  [3] readxl_1.4.2                DESeq2_1.40.1              
##  [5] SummarizedExperiment_1.30.2 MatrixGenerics_1.12.2      
##  [7] matrixStats_1.0.0           ggplot2_3.4.2              
##  [9] dplyr_1.1.2                 GenomicFeatures_1.52.0     
## [11] AnnotationDbi_1.62.1        Biobase_2.60.0             
## [13] GenomicRanges_1.52.0        GenomeInfoDb_1.36.0        
## [15] IRanges_2.34.0              S4Vectors_0.38.1           
## [17] BiocGenerics_0.46.0         report_0.5.7               
## [19] rmarkdown_2.23              knitr_1.43                 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0         farver_2.1.1             blob_1.2.4              
##  [4] filelock_1.0.2           Biostrings_2.68.1        bitops_1.0-7            
##  [7] fastmap_1.1.1            RCurl_1.98-1.12          BiocFileCache_2.8.0     
## [10] GenomicAlignments_1.36.0 XML_3.99-0.14            digest_0.6.31           
## [13] lifecycle_1.0.3          KEGGREST_1.40.0          RSQLite_2.3.1           
## [16] magrittr_2.0.3           compiler_4.3.0           rlang_1.1.1             
## [19] sass_0.4.6               progress_1.2.2           tools_4.3.0             
## [22] utf8_1.2.3               yaml_2.3.7               rtracklayer_1.60.0      
## [25] labeling_0.4.2           prettyunits_1.1.1        S4Arrays_1.0.4          
## [28] bit_4.0.5                curl_5.0.1               DelayedArray_0.26.3     
## [31] xml2_1.3.4               BiocParallel_1.34.2      withr_2.5.0             
## [34] grid_4.3.0               fansi_1.0.4              colorspace_2.1-0        
## [37] scales_1.2.1             biomaRt_2.56.1           insight_0.19.3          
## [40] cli_3.6.1                crayon_1.5.2             generics_0.1.3          
## [43] rstudioapi_0.14          httr_1.4.6               rjson_0.2.21            
## [46] DBI_1.1.3                cachem_1.0.8             zlibbioc_1.46.0         
## [49] parallel_4.3.0           cellranger_1.1.0         XVector_0.40.0          
## [52] restfulr_0.0.15          vctrs_0.6.3              Matrix_1.5-4.1          
## [55] jsonlite_1.8.5           hms_1.1.3                bit64_4.0.5             
## [58] locfit_1.5-9.8           jquerylib_0.1.4          glue_1.6.2              
## [61] codetools_0.2-19         stringi_1.7.12           gtable_0.3.3            
## [64] BiocIO_1.10.0            munsell_0.5.0            tibble_3.2.1            
## [67] pillar_1.9.0             rappdirs_0.3.3           htmltools_0.5.5         
## [70] GenomeInfoDbData_1.2.10  R6_2.5.1                 dbplyr_2.3.2            
## [73] evaluate_0.21            lattice_0.21-8           highr_0.10              
## [76] png_0.1-8                Rsamtools_2.16.0         memoise_2.0.1           
## [79] bslib_0.5.0              Rcpp_1.0.10              xfun_0.39               
## [82] pkgconfig_2.0.3